'textLength';
case 'viewbox':
return 'viewBox';
case 'viewtarget':
return 'viewTarget';
case 'xchannelselector':
return 'xChannelSelector';
case 'ychannelselector':
return 'yChannelSelector';
case 'zoomandpan':
return 'zoomAndPan';
}
}
if ( 'html' !== $namespace ) {
switch ( $lower_name ) {
case 'xlink:actuate':
return 'xlink actuate';
case 'xlink:arcrole':
return 'xlink arcrole';
case 'xlink:href':
return 'xlink href';
case 'xlink:role':
return 'xlink role';
case 'xlink:show':
return 'xlink show';
case 'xlink:title':
return 'xlink title';
case 'xlink:type':
return 'xlink type';
case 'xml:lang':
return 'xml lang';
case 'xml:space':
return 'xml space';
case 'xmlns':
return 'xmlns';
case 'xmlns:xlink':
return 'xmlns xlink';
}
}
return $attribute_name;
}
/**
 * Reports whether the currently matched tag was written with the self-closing flag.
 *
 * This is a purely syntactic question: the method only answers whether the
 * flag is present in the tag's source text, not whether the tag self-closes.
 *
 *  - No HTML element ought to carry the flag, and on HTML elements it is ignored.
 *    For void elements that's benign since they "self close" automatically. For
 *    non-void HTML elements, problems appear if the flag was intended to stand
 *    in for an element with an empty body.
 *  - For HTML foreign elements and custom elements the flag determines whether
 *    they self-close or not.
 *
 * @since 6.3.0
 *
 * @return bool Whether the currently matched tag contains the self-closing flag.
 */
public function has_self_closing_flag(): bool {
	// Nothing but a matched tag can carry the flag.
	if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
		return false;
	}

	/*
	 * The flag is the solidus at the _end_ of the tag, not the one at the
	 * beginning of the tag.
	 *
	 * Example:
	 *
	 *     <figure />
	 *             ^ this sits one character before the closing ">".
	 */
	$last_byte_at = $this->token_starts_at + $this->token_length - 1;
	$solidus_at   = $last_byte_at - 1;

	return '/' === $this->html[ $solidus_at ];
}
/**
 * Indicates if the current tag token is a tag closer.
 *
 * Example:
 *
 *     $p = new WP_HTML_Tag_Processor( '<div></div>' );
 *     $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) );
 *     $p->is_tag_closer() === false;
 *
 *     $p->next_tag( array( 'tag_name' => 'div', 'tag_closers' => 'visit' ) );
 *     $p->is_tag_closer() === true;
 *
 * @since 6.2.0
 * @since 6.7.0 Reports all BR tags as opening tags.
 *
 * @return bool Whether the current tag is a tag closer.
 */
public function is_tag_closer(): bool {
	// Only tags can be closers.
	if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
		return false;
	}

	if ( ! $this->is_closing_tag ) {
		return false;
	}

	/*
	 * The BR tag can only exist as an opening tag. If something like `</br>`
	 * appears then the HTML parser will treat it as an opening tag with no
	 * attributes. The BR tag is unique in this way.
	 *
	 * @see https://html.spec.whatwg.org/#parsing-main-inbody
	 */
	return 'BR' !== $this->get_tag();
}
/**
 * Indicates the kind of matched token, if any.
 *
 * Unlike `get_token_name()`, which may return a value derived from
 * the token itself (such as a tag name), this method reports a
 * static string for tags and DOCTYPE declarations. Every other
 * token kind is reported with the same value `get_token_name()`
 * returns for it.
 *
 * Possible values:
 *  - `#tag` when matched on a tag.
 *  - `#text` when matched on a text node.
 *  - `#cdata-section` when matched on a CDATA node.
 *  - `#comment` when matched on a comment.
 *  - `#doctype` when matched on a DOCTYPE declaration.
 *  - `#presumptuous-tag` when matched on an empty tag closer.
 *  - `#funky-comment` when matched on a funky comment.
 *
 * @since 6.5.0
 *
 * @return string|null What kind of token is matched, or null.
 */
public function get_token_type(): ?string {
	$state = $this->parser_state;

	if ( self::STATE_MATCHED_TAG === $state ) {
		return '#tag';
	}

	if ( self::STATE_DOCTYPE === $state ) {
		return '#doctype';
	}

	// The remaining token kinds already have static names.
	return $this->get_token_name();
}
/**
 * Returns the node name represented by the token.
 *
 * This matches the DOM API value `nodeName`. Some values
 * are static, such as `#text` for a text node, while others
 * are dynamically generated from the token itself.
 *
 * Dynamic names:
 *  - Uppercase tag name for tag matches.
 *  - `html` for DOCTYPE declarations.
 *
 * When the Tag Processor is not matched on a token this
 * returns `null`, either because no token has been found
 * yet or because the end of the document was reached
 * without matching one.
 *
 * @since 6.5.0
 *
 * @return string|null Name of the matched token.
 */
public function get_token_name(): ?string {
	$state = $this->parser_state;

	// Tags take their name from the token itself.
	if ( self::STATE_MATCHED_TAG === $state ) {
		return $this->get_tag();
	}

	if ( self::STATE_TEXT_NODE === $state ) {
		return '#text';
	}

	if ( self::STATE_CDATA_NODE === $state ) {
		return '#cdata-section';
	}

	if ( self::STATE_COMMENT === $state ) {
		return '#comment';
	}

	if ( self::STATE_DOCTYPE === $state ) {
		return 'html';
	}

	if ( self::STATE_PRESUMPTUOUS_TAG === $state ) {
		return '#presumptuous-tag';
	}

	if ( self::STATE_FUNKY_COMMENT === $state ) {
		return '#funky-comment';
	}

	// Not matched on any token.
	return null;
}
/**
 * Indicates what kind of comment produced the comment node.
 *
 * Several different kinds of HTML syntax produce comments, so the
 * Tag Processor tracks which one it encountered and exposes it as
 * a type for the comment. Nominally only regular HTML comments
 * exist as they are commonly known, but a number of unrelated
 * syntax errors also produce comments.
 *
 * @see self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT
 * @see self::COMMENT_AS_CDATA_LOOKALIKE
 * @see self::COMMENT_AS_INVALID_HTML
 * @see self::COMMENT_AS_HTML_COMMENT
 * @see self::COMMENT_AS_PI_NODE_LOOKALIKE
 *
 * @since 6.5.0
 *
 * @return string|null The tracked comment type when matched on a comment, otherwise null.
 */
public function get_comment_type(): ?string {
	return self::STATE_COMMENT === $this->parser_state
		? $this->comment_type
		: null;
}
/**
 * Returns the text of a matched comment or null if not on a comment type node.
 *
 * This method returns the entire text content of a comment node as it
 * would appear in the browser.
 *
 * This differs from {@see ::get_modifiable_text()} in that certain comment
 * types in the HTML API cannot allow their entire comment text content to
 * be modified. Namely, "bogus comments" of the form `<?not allowed in html>`
 * will create a comment whose text content starts with `?`. Note that if
 * that character were modified, it would be possible to change the node
 * type.
 *
 * @since 6.7.0
 *
 * @return string|null The comment text as it would appear in the browser or null
 *                     if not on a comment type node.
 */
public function get_full_comment_text(): ?string {
	// Funky comments expose their entire text as modifiable text.
	if ( self::STATE_FUNKY_COMMENT === $this->parser_state ) {
		return $this->get_modifiable_text();
	}

	if ( self::STATE_COMMENT !== $this->parser_state ) {
		return null;
	}

	switch ( $this->get_comment_type() ) {
		case self::COMMENT_AS_HTML_COMMENT:
		case self::COMMENT_AS_ABRUPTLY_CLOSED_COMMENT:
			return $this->get_modifiable_text();

		// Re-wrap the modifiable text with the syntax that surrounds it in the comment data.
		case self::COMMENT_AS_CDATA_LOOKALIKE:
			return "[CDATA[{$this->get_modifiable_text()}]]";

		case self::COMMENT_AS_PI_NODE_LOOKALIKE:
			return "?{$this->get_tag()}{$this->get_modifiable_text()}?";

		/*
		 * This represents "bogus comments state" from HTML tokenization.
		 * This can be entered by `<?` or `<!`, where `?` is included in
		 * the comment text but `!` is not.
		 */
		case self::COMMENT_AS_INVALID_HTML:
			$preceding_character = $this->html[ $this->text_starts_at - 1 ];
			$comment_start       = '?' === $preceding_character ? '?' : '';
			return "{$comment_start}{$this->get_modifiable_text()}";
	}

	return null;
}
/**
 * Subdivides a matched text node, splitting NULL byte sequences and decoded
 * whitespace off as distinct node prefixes.
 *
 * Once anything that's neither a NULL byte nor decoded whitespace is
 * encountered, the remainder of the text node is left intact as generic text.
 *
 *  - The HTML Processor uses this to apply distinct rules for different kinds of text.
 *  - Inter-element whitespace can be detected and skipped with this method.
 *
 * Text nodes aren't eagerly subdivided because there's no need to split them unless
 * decisions are being made on NULL byte sequences or whitespace-only text.
 *
 * Example:
 *
 *     $processor = new WP_HTML_Tag_Processor( "\x00Apples & Oranges" );
 *     true  === $processor->next_token();                   // Text is "Apples & Oranges".
 *     true  === $processor->subdivide_text_appropriately(); // Text is "".
 *     true  === $processor->next_token();                   // Text is "Apples & Oranges".
 *     false === $processor->subdivide_text_appropriately();
 *
 *     $processor = new WP_HTML_Tag_Processor( " \r\n\tMore" );
 *     true  === $processor->next_token();                   // Text is " ␤␉More".
 *     true  === $processor->subdivide_text_appropriately(); // Text is " ␤␉".
 *     true  === $processor->next_token();                   // Text is "More".
 *     false === $processor->subdivide_text_appropriately();
 *
 * @since 6.7.0
 *
 * @return bool Whether the text node was subdivided.
 */
public function subdivide_text_appropriately(): bool {
	if ( self::STATE_TEXT_NODE !== $this->parser_state ) {
		return false;
	}

	// Until proven otherwise the text is treated as generic.
	$this->text_node_classification = self::TEXT_IS_GENERIC;

	/*
	 * NULL bytes are treated categorically different than numeric character
	 * references whose number is zero. `&#x0;` is not the same as `"\x00"`.
	 */
	$null_run = strspn( $this->html, "\x00", $this->text_starts_at, $this->text_length );
	if ( $null_run > 0 ) {
		$this->token_length             = $null_run;
		$this->text_length              = $null_run;
		$this->bytes_already_parsed     = $this->token_starts_at + $null_run;
		$this->text_node_classification = self::TEXT_IS_NULL_SEQUENCE;
		return true;
	}

	/*
	 * Run a decoding loop to find the point at which the text subdivides.
	 * The whitespace prefix spans raw whitespace bytes as well as any
	 * character reference that decodes to the same.
	 */
	$whitespace = " \t\f\r\n";
	$text_start = $this->text_starts_at;
	$text_end   = $text_start + $this->text_length;
	$cursor     = $text_start;

	while ( $cursor < $text_end ) {
		// Step over raw whitespace bytes.
		$cursor += strspn( $this->html, $whitespace, $cursor, $text_end - $cursor );

		// Anything other than a potential character reference ends the prefix.
		if ( $cursor >= $text_end || '&' !== $this->html[ $cursor ] ) {
			break;
		}

		$reference_length = null;
		$decoded          = WP_HTML_Decoder::read_character_reference( 'data', $this->html, $cursor, $reference_length );
		if ( ! isset( $decoded ) || 1 !== strspn( $decoded, $whitespace ) ) {
			break;
		}

		// The reference decoded to whitespace: step over it and keep scanning.
		$cursor += $reference_length;
	}

	// No whitespace prefix was found; leave the text node intact.
	if ( $cursor <= $text_start ) {
		return false;
	}

	$prefix_length                  = $cursor - $text_start;
	$this->text_length              = $prefix_length;
	$this->token_length             = $prefix_length;
	$this->bytes_already_parsed     = $cursor;
	$this->text_node_classification = self::TEXT_IS_WHITESPACE;

	return true;
}
/**
 * Returns the modifiable text for a matched token, or an empty string.
 *
 * Modifiable text is text content that may be read and changed without
 * changing the HTML structure of the document around it. This includes
 * the contents of `#text` nodes in the HTML as well as the inner
 * contents of HTML comments, Processing Instructions, and others, even
 * though these nodes aren't part of a parsed DOM tree. They also contain
 * the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
 * other section in an HTML document which cannot contain HTML markup (DATA).
 *
 * If a token has no modifiable text then an empty string is returned to
 * avoid needless crashing or type errors. An empty string does not mean
 * that a token has modifiable text, and a token with modifiable text may
 * have an empty string (e.g. a comment with no contents).
 *
 * Limitations:
 *
 *  - This function will not strip the leading newline appropriately
 *    after seeking into a LISTING or PRE element. To ensure that the
 *    newline is treated properly, seek to the LISTING or PRE opening
 *    tag instead of to the first text node inside the element.
 *
 * @since 6.5.0
 * @since 6.7.0 Replaces NULL bytes (U+0000) and newlines appropriately.
 *
 * @return string
 */
public function get_modifiable_text(): string {
	// An enqueued update takes precedence over what's in the source document.
	$pending_update = $this->lexical_updates['modifiable text'] ?? null;

	if ( null === $pending_update ) {
		if ( null === $this->text_starts_at || 0 === $this->text_length ) {
			return '';
		}

		$raw_text = substr( $this->html, $this->text_starts_at, $this->text_length );
	} else {
		$raw_text = $pending_update->text;
	}

	/*
	 * Pre-processing the input stream would normally happen before
	 * any parsing is done, but deferring it means it's possible to
	 * skip in most cases. When getting the modifiable text, however
	 * it's important to apply the pre-processing steps, which is
	 * normalizing newlines: first CRLF pairs, then any lone CR.
	 *
	 * @see https://html.spec.whatwg.org/#preprocessing-the-input-stream
	 * @see https://infra.spec.whatwg.org/#normalize-newlines
	 */
	$raw_text = str_replace( array( "\r\n", "\r" ), "\n", $raw_text );

	// Comment data is not decoded.
	$undecoded_states = array(
		self::STATE_CDATA_NODE,
		self::STATE_COMMENT,
		self::STATE_DOCTYPE,
		self::STATE_FUNKY_COMMENT,
	);
	if ( in_array( $this->parser_state, $undecoded_states, true ) ) {
		return str_replace( "\x00", "\u{FFFD}", $raw_text );
	}

	$node_name = $this->get_token_name();

	// Neither script data (SCRIPT) nor RAWTEXT data (the rest) is decoded.
	$undecoded_elements = array( 'SCRIPT', 'IFRAME', 'NOEMBED', 'NOFRAMES', 'STYLE', 'XMP' );
	if ( in_array( $node_name, $undecoded_elements, true ) ) {
		return str_replace( "\x00", "\u{FFFD}", $raw_text );
	}

	$decoded = WP_HTML_Decoder::decode_text_node( $raw_text );

	/*
	 * Skip the first line feed after LISTING, PRE, and TEXTAREA opening tags.
	 *
	 * Note that this first newline may come in the form of a character
	 * reference, such as `&#x0a;`, and so it's important to perform
	 * this transformation only after decoding the raw text content.
	 */
	$starts_with_newline  = "\n" === ( $decoded[0] ?? '' );
	$is_marked_text_node  = '#text' === $node_name && $this->skip_newline_at === $this->token_starts_at;
	$should_strip_newline = $starts_with_newline && ( $is_marked_text_node || 'TEXTAREA' === $node_name );
	if ( $should_strip_newline ) {
		$decoded = substr( $decoded, 1 );
	}

	/*
	 * Only in normative text nodes does the NULL byte (U+0000) get removed.
	 * In all other contexts it's replaced by the replacement character (U+FFFD)
	 * for security reasons (to avoid joining together strings that were safe
	 * when separated, but not when joined).
	 *
	 * @todo Inside HTML integration points and MathML integration points, the
	 *       text is processed according to the insertion mode, not according
	 *       to the foreign content rules. This should strip the NULL bytes.
	 */
	$null_substitute = ( '#text' === $node_name && 'html' === $this->get_namespace() )
		? ''
		: "\u{FFFD}";

	return str_replace( "\x00", $null_substitute, $decoded );
}
/**
* Sets the modifiable text for the matched token, if matched.
*
* Modifiable text is text content that may be read and changed without
* changing the HTML structure of the document around it. This includes
* the contents of `#text` nodes in the HTML as well as the inner
* contents of HTML comments, Processing Instructions, and others, even
* though these nodes aren't part of a parsed DOM tree. They also contain
* the contents of SCRIPT and STYLE tags, of TEXTAREA tags, and of any
* other section in an HTML document which cannot contain HTML markup (DATA).
*
* Not all modifiable text may be set by this method, and not all content
* may be set as modifiable text. In the case that this fails it will return
* `false` indicating as much. For instance, it will not allow inserting the
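* string `</script` into a SCRIPT element, because the rules for escaping
* that safely are complicated. Similarly, it will not allow setting content
* into a comment which would prematurely terminate the comment.
*
* Example:
*
*     // Add a preface to all STYLE contents.
*     while ( $processor->next_tag( 'STYLE' ) ) {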
* $style = $processor->get_modifiable_text();
* $processor->set_modifiable_text( "// Made with love on the World Wide Web\n{$style}" );
* }
*
* // Replace smiley text with Emoji smilies.
* while ( $processor->next_token() ) {
* if ( '#text' !== $processor->get_token_name() ) {
* continue;
* }
*
* $chunk = $processor->get_modifiable_text();
* if ( ! str_contains( $chunk, ':)' ) ) {
* continue;
* }
*
* $processor->set_modifiable_text( str_replace( ':)', '🙂', $chunk ) );
* }
*
* This function handles all necessary HTML encoding. Provide normal, unescaped string values.
* The HTML API will encode the strings appropriately so that the browser will interpret them
* as the intended value.
*
* Example:
*
*     // Renders as “Eggs & Milk” in a browser, encoded as `<p>Eggs &amp; Milk</p>`.
*     $processor->set_modifiable_text( 'Eggs & Milk' );
*
*     // Renders as “Eggs &amp; Milk” in a browser, encoded as `<p>Eggs &amp;amp; Milk</p>`.
*     $processor->set_modifiable_text( 'Eggs &amp; Milk' );
*
* @since 6.7.0
* @since 6.9.0 Escapes all character references instead of trying to avoid double-escaping.
*
* @param string $plaintext_content New text content to represent in the matched token.
* @return bool Whether the text was able to update.
*/
public function set_modifiable_text( string $plaintext_content ): bool {
if ( self::STATE_TEXT_NODE === $this->parser_state ) {
$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
$this->text_starts_at,
$this->text_length,
strtr(
$plaintext_content,
array(
'<' => '<',
'>' => '>',
'&' => '&',
'"' => '"',
"'" => ''',
)
)
);
return true;
}
// Comment data is not encoded.
if (
self::STATE_COMMENT === $this->parser_state &&
self::COMMENT_AS_HTML_COMMENT === $this->comment_type
) {
// Check if the text could close the comment.
if ( 1 === preg_match( '/--!?>/', $plaintext_content ) ) {
return false;
}
$this->lexical_updates['modifiable text'] = new WP_HTML_Text_Replacement(
$this->text_starts_at,
$this->text_length,
$plaintext_content
);
return true;
}
if ( self::STATE_MATCHED_TAG !== $this->parser_state ) {
return false;
}
switch ( $this->get_tag() ) {
case 'SCRIPT':
/**
* This is over-protective, but ensures the update doesn't break
* the HTML structure of the SCRIPT element.
*
* More thorough analysis could track the HTML tokenizer states
* and to ensure that the SCRIPT element closes at the expected
* SCRIPT close tag as is done in {@see ::skip_script_data()}.
*
* A SCRIPT element could be closed prematurely by contents
* like ``. A SCRIPT element could be prevented from
* closing by contents like `
*
*
* @since 6.5.0
*/
const COMMENT_AS_ABRUPTLY_CLOSED_COMMENT = 'COMMENT_AS_ABRUPTLY_CLOSED_COMMENT';
/**
 * Indicates that a comment would be parsed as a CDATA node,
 * were HTML to allow CDATA nodes outside of foreign content.
 *
 * Example:
 *
 *     <![CDATA[This is a CDATA node.]]>
 *
 * This is an HTML comment, but it looks like a CDATA node.
 *
 * @since 6.5.0
 */
const COMMENT_AS_CDATA_LOOKALIKE = 'COMMENT_AS_CDATA_LOOKALIKE';
/**
 * Indicates that a comment was created when encountering
 * normative HTML comment syntax.
 *
 * Example:
 *
 *     <!-- this is a comment -->
 *
 * @since 6.5.0
 */
const COMMENT_AS_HTML_COMMENT = 'COMMENT_AS_HTML_COMMENT';
/**
 * Indicates that a comment would be parsed as a Processing
 * Instruction node, were they to exist within HTML.
 *
 * Example:
 *
 *     <?wp __( 'Like' ); ?>
 *
 * This is an HTML comment, but it looks like a Processing Instruction node.
 *
 * @since 6.5.0
 */
const COMMENT_AS_PI_NODE_LOOKALIKE = 'COMMENT_AS_PI_NODE_LOOKALIKE';
/**
 * Indicates that a comment was created when encountering invalid
 * HTML input, a so-called "bogus comment."
 *
 * Example:
 *
 *     <?nothing special>
 *     <!{nothing special}>
 *
 * @since 6.5.0
 */
const COMMENT_AS_INVALID_HTML = 'COMMENT_AS_INVALID_HTML';
/**
 * No-quirks mode document compatibility mode.
 *
 * As described in the MDN article linked below:
 *
 * > In no-quirks mode, the behavior is (hopefully) the desired behavior
 * > described by the modern HTML and CSS specifications.
 *
 * @see self::$compat_mode
 * @see https://developer.mozilla.org/en-US/docs/Web/HTML/Quirks_Mode_and_Standards_Mode
 *
 * @since 6.7.0
 *
 * @var string
 */
const NO_QUIRKS_MODE = 'no-quirks-mode';
/**
 * Quirks mode document compatibility mode.
 *
 * As described in the MDN article linked below:
 *
 * > In quirks mode, layout emulates behavior in Navigator 4 and Internet
 * > Explorer 5. This is essential in order to support websites that were
 * > built before the widespread adoption of web standards.
 *
 * @see self::$compat_mode
 * @see https://developer.mozilla.org/en-US/docs/Web/HTML/Quirks_Mode_and_Standards_Mode
 *
 * @since 6.7.0
 *
 * @var string
 */
const QUIRKS_MODE = 'quirks-mode';
/**
 * Indicates that a span of text may contain any combination of significant
 * kinds of characters: NULL bytes, whitespace, and others.
 *
 * @see self::$text_node_classification
 * @see self::subdivide_text_appropriately
 *
 * @since 6.7.0
 *
 * @var string
 */
const TEXT_IS_GENERIC = 'TEXT_IS_GENERIC';
/**
 * Indicates that a span of text comprises a sequence only of NULL bytes.
 *
 * @see self::$text_node_classification
 * @see self::subdivide_text_appropriately
 *
 * @since 6.7.0
 *
 * @var string
 */
const TEXT_IS_NULL_SEQUENCE = 'TEXT_IS_NULL_SEQUENCE';
/**
 * Indicates that a span of decoded text comprises only whitespace.
 *
 * @see self::$text_node_classification
 * @see self::subdivide_text_appropriately
 *
 * @since 6.7.0
 *
 * @var string
 */
const TEXT_IS_WHITESPACE = 'TEXT_IS_WHITESPACE';
/**
 * Wakeup magic method.
 *
 * PHP invokes this when an instance is being unserialized; it always
 * throws, so instances of this class cannot be restored that way.
 *
 * @since 6.9.2
 *
 * @throws \LogicException Always.
 */
public function __wakeup() {
	$message = __CLASS__ . ' should never be unserialized';

	throw new \LogicException( $message );
}
}